#!/bin/bash
###############################################################
##            ATTENTION -- ATTENTION -- ATTENTION            ##
##             K8S NCCL2 multi-node job example              ##
###############################################################
# Job name; passed to the CLI as --job-name.
# NOTE(review): the "<$ ... $>" markers here and below look like template
# placeholders that are presumably substituted before this script is run —
# confirm against whatever generates this file.
job_name=<$ JOB_NAME $>

# Job parameters
group_name="<$ GROUP_NAME $>"               # passed as --group-name
# Passed as --job-version.
job_version="paddle-fluid-v1.7.1"
# Passed (quoted) as --start-cmd.
start_cmd="<$ START_CMD $>"
# Passed as --wall-time.
# NOTE(review): looks like an HH:MM:SS duration — confirm with the CLI docs.
wall_time="2000:00:00"

# Kubernetes settings; each one is forwarded to the matching --k8s-* flag.
# k8s_trainers and k8s_gpu_cards are also compared against 1 further down to
# pick the stand-alone / single-GPU submission mode.
k8s_priority=<$ K8S_PRIORITY $>
k8s_trainers=<$ K8S_TRAINERS $>
k8s_cpu_cores=<$ K8S_CPU_CORES $>
k8s_gpu_cards=<$ K8S_GPU_CARD $>

#######################################
# Pick the submission mode from the requested trainer / GPU counts.
# Globals:
#   k8s_trainers   (read)    - compared against 1 to detect a single trainer
#   k8s_gpu_cards  (read)    - compared against 1 to detect a single GPU card
#   is_stand_alone (written) - 1 when k8s_trainers is 1, otherwise 0
#   nccl           (written) - extra flags for the submit command; kept as one
#                              plain string because it is expanded unquoted
#                              (word-split) where the command is built
# Outputs:
#   Prints a notice to stdout when one trainer with one GPU card is requested.
#######################################
resolve_job_mode() {
    # Default: multi-trainer job using NCCL2.
    is_stand_alone=0
    nccl="--distribute-job-type NCCL2"

    # [[ ]] with quoted operands does not error out on empty values.
    if [[ "${k8s_trainers}" == 1 ]]; then
        is_stand_alone=1
        nccl="--job-remark single-trainer"
        # The previous form `[ ... == 1]` lacked the space before `]`, so the
        # test always failed and this branch could never be taken.
        if [[ "${k8s_gpu_cards}" == 1 ]]; then
            nccl="--job-remark single-gpu"
            echo "Attention: Use single GPU card for PaddleRec distributed training, please set runner class from 'cluster_train' to 'train' in config.yaml."
        fi
    fi
}
resolve_job_mode

# Your ak/sk (available on the paddlecloud web page under "Personal Center").
# Passed to the CLI below as --ak / --sk.
ak=<$ AK $>
sk=<$ SK $>

# Submit the training job through the paddlecloud CLI.
# Every expansion is double-quoted so a value containing spaces or glob
# characters reaches the CLI as exactly one argument. Two deliberate
# exceptions:
#   ./*      - unquoted so the shell expands it to the entries of the current
#              directory for --files
#   ${nccl}  - unquoted because it carries a flag and its value in a single
#              string and relies on word splitting to become two arguments
# shellcheck disable=SC2086
paddlecloud job --ak "${ak}" --sk "${sk}" \
        train --job-name "${job_name}" \
        --group-name "${group_name}" \
        --job-conf config.ini \
        --start-cmd "${start_cmd}" \
        --files ./* \
        --job-version "${job_version}" \
        --k8s-trainers "${k8s_trainers}" \
        --k8s-cpu-cores "${k8s_cpu_cores}" \
        --k8s-gpu-cards "${k8s_gpu_cards}" \
        --k8s-priority "${k8s_priority}" \
        --wall-time "${wall_time}" \
        --is-standalone "${is_stand_alone}" \
        --json \
        ${nccl}
        